#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
This file is part of the web2py Web Framework
Copyrighted by Massimo Di Pierro <mdipierro@cs.depaul.edu>
License: LGPLv3 (http://www.gnu.org/licenses/lgpl.html)

Created by Vladyslav Kozlovskyy (Ukraine) <dbdevelop©gmail.com>
       for Web2py project

Utilities and class for UTF8 strings managing
===========================================
"""
import __builtin__
__all__ = ['Utf8']

repr_escape_tab={}
for i in range(1,32): repr_escape_tab[i]=ur'\x%02x'%i
repr_escape_tab[7]=u'\\a'
repr_escape_tab[8]=u'\\b'
repr_escape_tab[9]=u'\\t'
repr_escape_tab[10]=u'\\n'
repr_escape_tab[11]=u'\\v'
repr_escape_tab[12]=u'\\f'
repr_escape_tab[13]=u'\\r'
repr_escape_tab[ord('\\')]=u'\\\\'
repr_escape_tab2=repr_escape_tab.copy()
repr_escape_tab2[ord('\'')]=u"\\'"

def sort_key(s):
    """ Unicode Collation Algorithm (UCA) (http://www.unicode.org/reports/tr10/)
        is used for utf-8 and unicode strings sorting and for utf-8 strings
        comparison

        NOTE: pyuca is a very memory cost module! It loads the whole
              "allkey.txt" file (~2mb!) into the memory. But this
              functionality is needed only when sort_key() is called as a
              part of sort() function or when Utf8 strings are compared.

        So, it is a lazy "sort_key" function which (ONLY ONCE, ON ITS
        FIRST CALL) imports pyuca and replaces itself with a real
        sort_key() function
    """
    global sort_key
    try:
        from contrib.pyuca import unicode_collator
        unicode_sort_key = unicode_collator.sort_key
        sort_key=lambda s: unicode_sort_key(
                unicode(s, 'utf-8') if isinstance(s, str) else s)
    except:
        sort_key=lambda s: (unicode(s, 'utf-8') if isinstance(s, str) else s).lower()
    return sort_key(s)


def ord(char):
    """ returns unicode id for utf8 or unicode *char* character

        SUPPOSE that *char* is an utf-8 or unicode character only
    """
    if isinstance(char, unicode): return __builtin__.ord(char)
    return __builtin__.ord(unicode(char, 'utf-8'))

def chr(code):
    """ return utf8-character with *code* unicode id """
    return Utf8(unichr(code))

def size(string):
    """ return length of utf-8 string in bytes
        NOTE! The length of correspondent utf-8
              string is returned for unicode string
    """
    return Utf8(string).__size__()

def truncate(string, length, dots='...'):
    """ returns string of length < *length* or truncate
        string with adding *dots* suffix to the string's end

    args:
         length (int): max length of string
         dots (str or unicode): string suffix, when string is cutted

     returns:
         (utf8-str): original or cutted string
    """
    text = unicode(string, 'utf-8')
    dots = unicode(dots, 'utf-8') if isinstance(dots, str) else dots
    if len(text) > length:
        text = text[:length-len(dots)] + dots
    return str.__new__(Utf8, text.encode('utf-8'))


class Utf8(str):
   """
   Class for utf8 string storing and manipulations

   The base presupposition of this class usage is:
   "ALL strings in the application are either of
   utf-8 or unicode type, even when simple str
   type is used. UTF-8 is only a "packed" version
   of unicode, so Utf-8 and unicode strings are
   interchangeable."

   CAUTION! This class is slower than str/unicode!
   Do NOT use it inside intensive loops. Simply
   decode string(s) to unicode before loop and
   encode it back to utf-8 string(s) after
   intensive calculation.

   You can see the benefit of this class in doctests() below
   """
   def __new__(cls, content='', codepage='utf-8'):
      if isinstance(content, unicode):
         return str.__new__(cls, unicode.encode(content, 'utf-8'))
      elif codepage in ('utf-8', 'utf8') or isinstance(content, cls):
         return str.__new__(cls, content)
      else:
         return str.__new__(cls, unicode(content, codepage).encode('utf-8'))

   def __repr__(self):
       r''' # note that we use raw strings to avoid having to use double back slashes below
       NOTE! This function is a clone of web2py:gluon.languages.utf_repl() function

       utf8.__repr__() works same as str.repr() when processing ascii string
       >>> repr(Utf8('abc')) == repr(Utf8("abc")) == repr('abc') == repr("abc") == "'abc'"
       True
       >>> repr(Utf8('a"b"c')) == repr('a"b"c') == '\'a"b"c\''
       True
       >>> repr(Utf8("a'b'c")) == repr("a'b'c") == '"a\'b\'c"'
       True
       >>> repr(Utf8('a\'b"c')) == repr('a\'b"c') == repr(Utf8("a'b\"c")) == repr("a'b\"c") == '\'a\\\'b"c\''
       True
       >>> repr(Utf8('a\r\nb')) == repr('a\r\nb') == "'a\\r\\nb'" # Test for \r, \n
       True

       Unlike str.repr(), Utf8.__repr__() remains utf8 content when processing utf8 string
       >>> repr(Utf8('中文字')) == repr(Utf8("中文字")) == "'中文字'" != repr('中文字')
       True
       >>> repr(Utf8('中"文"字')) == "'中\"文\"字'" != repr('中"文"字')
       True
       >>> repr(Utf8("中'文'字")) == '"中\'文\'字"' != repr("中'文'字")
       True
       >>> repr(Utf8('中\'文"字')) == repr(Utf8("中'文\"字")) == '\'中\\\'文"字\'' != repr('中\'文"字') == repr("中'文\"字")
       True
       >>> repr(Utf8('中\r\n文')) == "'中\\r\\n文'" != repr('中\r\n文') # Test for \r, \n
       True
       '''
       if str.find(self,"'") >= 0 and str.find(self,'"') < 0: # only single quote exists
           return '"'+unicode(self, 'utf-8').translate(repr_escape_tab).encode('utf-8')+'"'
       else:
           return "'"+unicode(self, 'utf-8').translate(repr_escape_tab2).encode('utf-8')+"'"

   def __size__(self):
       """ length of utf-8 string in bytes """
       return str.__len__(self)

   def __contains__(self, other):      return str.__contains__(self, Utf8(other))
   def __getitem__(self, index):       return str.__new__(Utf8, unicode(self, 'utf-8')[index].encode('utf-8'))
   def __getslice__(self, begin, end): return str.__new__(Utf8, unicode(self, 'utf-8')[begin:end].encode('utf-8'))

   def __add__(self, other):  return str.__new__(Utf8, str.__add__(self, unicode.encode(other, 'utf-8')
                                                        if isinstance(other, unicode) else other))
   def __len__(self):         return len(unicode(self, 'utf-8'))
   def __mul__(self, integer):return str.__new__(Utf8, str.__mul__(self, integer))
   def __eq__(self, string):  return str.__eq__(self, Utf8(string))
   def __ne__(self, string):  return str.__ne__(self, Utf8(string))
   def capitalize(self):      return str.__new__(Utf8, unicode(self, 'utf-8').capitalize().encode('utf-8'))
   def center(self, length):  return str.__new__(Utf8, unicode(self, 'utf-8').center(length).encode('utf-8'))
   def upper(self):           return str.__new__(Utf8, unicode(self, 'utf-8').upper().encode('utf-8'))
   def lower(self):           return str.__new__(Utf8, unicode(self, 'utf-8').lower().encode('utf-8'))
   def title(self):           return str.__new__(Utf8, unicode(self, 'utf-8').title().encode('utf-8'))
   def index(self, string):   return unicode(self, 'utf-8').index(string if isinstance(string,unicode) else unicode(string, 'utf-8'))
   def isalnum(self):         return unicode(self, 'utf-8').isalnum()
   def isalpha(self):         return unicode(self, 'utf-8').isalpha()
   def isdigit(self):         return unicode(self, 'utf-8').isdigit()
   def islower(self):         return unicode(self, 'utf-8').islower()
   def isspace(self):         return unicode(self, 'utf-8').isspace()
   def istitle(self):         return unicode(self, 'utf-8').istitle()
   def isupper(self):         return unicode(self, 'utf-8').isupper()
   def zfill(self, length):   return str.__new__(Utf8, unicode(self, 'utf-8').zfill(length).encode('utf-8'))
   def join(self, iter):      return str.__new__(Utf8, str.join(self, [Utf8(c) for c in
                                                                       list(unicode(iter, 'utf-8') if
                                                                            isinstance(iter, str) else
                                                                            iter)]))
   def lstrip(self, chars=None):   return str.__new__(Utf8, str.lstrip(self, None if chars is None else Utf8(chars)))
   def rstrip(self, chars=None ):  return str.__new__(Utf8, str.rstrip(self, None if chars is None else Utf8(chars)))
   def strip(self,  chars=None):   return str.__new__(Utf8, str.strip(self, None if chars is None else Utf8(chars)))
   def swapcase(self):             return str.__new__(Utf8, unicode(self, 'utf-8').swapcase().encode('utf-8'))

   def count(self, sub, start=0, end=None):
       unistr = unicode(self, 'utf-8')
       return unistr.count(unicode(sub, 'utf-8') if isinstance(sub, str) else sub,
                              start, len(unistr) if end is None else end)
   def decode(self, encoding='utf-8', errors='strict'): return str.decode(self, encoding, errors)
   def encode(self, encoding, errors='strict'): return unicode(self, 'utf-8').encode(encoding, errors)
   def expandtabs(self, tabsize=8): return str.__new__(Utf8, unicode(self, 'utf-8').expandtabs(tabsize).encode('utf-8'))
   def find(self, sub, start=None, end=None): return unicode(self, 'utf-8').find(unicode(sub, 'utf-8')
                                                     if isinstance(sub, str) else sub, start, end)
   def ljust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').ljust(width, unicode(fillchar, 'utf-8')
                                                              if isinstance(fillchar, str) else fillchar).encode('utf-8'))
   def partition(self, sep):
       (head, sep, tail) = str.partition(self, Utf8(sep))
       return ( str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail) )

   def replace(self, old, new, count=-1): return str.__new__(Utf8, str.replace(self, Utf8(old), Utf8(new), count))
   def rfind(self, sub, start=None, end=None): return unicode(self, 'utf-8').rfind(unicode(sub, 'utf-8')
                                                              if isinstance(sub, str) else sub, start, end)
   def rindex(self, string):   return unicode(self, 'utf-8').rindex(string if isinstance(string,unicode)
                                                                         else unicode(string, 'utf-8'))
   def rjust(self, width, fillchar=' '): return str.__new__(Utf8, unicode(self, 'utf-8').rjust(width, unicode(fillchar, 'utf-8')
                                                              if isinstance(fillchar, str) else fillchar).encode('utf-8'))
   def rpartition(self, sep):
       (head, sep, tail) = str.rpartition(self, Utf8(sep))
       return ( str.__new__(Utf8, head),
                str.__new__(Utf8, sep),
                str.__new__(Utf8, tail) )
   def rsplit(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.rsplit(self,
                                                                 None if sep is None else Utf8(sep), maxsplit)]
   def split(self, sep=None, maxsplit=-1): return [str.__new__(Utf8, part) for part in str.split(self,
                                                                 None if sep is None else Utf8(sep), maxsplit)]
   def splitlines(self,keepends=False): return [str.__new__(Utf8, part) for part in str.splitlines(self,keepends)]
   def startswith(self, prefix, start=0, end=None):
       unistr = unicode(self, 'utf-8')
       if isinstance(prefix, tuple):
           prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix)
       elif isinstance(prefix, str):
           prefix = unicode(prefix, 'utf-8')
       return unistr.startswith(prefix, start, len(unistr) if end is None else end)
   def translate(self, table, deletechars=''):
       if isinstance(table, dict):
           return str.__new__(Utf8, unicode(self, 'utf-8').translate(table).encode('utf-8'))
       else:
           return str.__new__(Utf8, str.translate(self, table, deletechars))
   def endswith(self, prefix, start=0, end=None):
       unistr = unicode(self, 'utf-8')
       if isinstance(prefix, tuple):
           prefix = tuple(unicode(s,'utf-8') if isinstance(s, str) else s for s in prefix)
       elif isinstance(prefix, str):
           prefix = unicode(prefix, 'utf-8')
       return unistr.endswith(prefix, start, len(unistr) if end is None else end)
   if hasattr(str, 'format'): # Python 2.5 hasn't got str.format() method
       def format(self, *args, **kwargs):
           args = [unicode(s, 'utf-8') if isinstance(s, str) else s for s in args]
           kwargs = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
                          unicode(v, 'utf-8') if isinstance(v, str) else v)
                             for k,v in kwargs.iteritems())
           return str.__new__(Utf8, unicode(self, 'utf-8').
                                         format(*args, **kwargs).encode('utf-8'))
   def __mod__(self, right):
       if isinstance(right, tuple):
           right = tuple(unicode(v, 'utf-8') if isinstance(v, str) else v
                             for v in right)
       elif isinstance(right, dict):
           right = dict((unicode(k, 'utf-8') if isinstance(k, str) else k,
                          unicode(v, 'utf-8') if isinstance(v, str) else v)
                             for k,v in right.iteritems())
       elif isinstance(right, str):
           right = unicode(right, 'utf-8')
       return str.__new__(Utf8, unicode(self, 'utf-8').__mod__(right).encode('utf-8'))
   def __ge__(self, string): return sort_key(self) >= sort_key(string)
   def __gt__(self, string): return sort_key(self) >  sort_key(string)
   def __le__(self, string): return sort_key(self) <= sort_key(string)
   def __lt__(self, string): return sort_key(self) <  sort_key(string)


if __name__ == '__main__':
    def doctests():
       u"""
       doctests:
       >>> test_unicode=u'ПРоба Є PRobe'
       >>> test_unicode_word=u'ПРоба'
       >>> test_number_str='12345'
       >>> test_unicode
       u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
       >>> print test_unicode
       ПРоба Є PRobe
       >>> test_word=test_unicode_word.encode('utf-8')
       >>> test_str=test_unicode.encode('utf-8')
       >>> s=Utf8(test_str)
       >>> s
       'ПРоба Є PRobe'
       >>> type(s)
       <class '__main__.Utf8'>
       >>> s == test_str
       True
       >>> len(test_str) # wrong length of utf8-string!
       19
       >>> len(test_unicode) # RIGHT!
       13
       >>> len(s) # RIGHT!
       13
       >>> size(test_str) # size of utf-8 string (in bytes) == len(str)
       19
       >>> size(test_unicode) # size of unicode string in bytes (packed to utf-8 string)
       19
       >>> size(s) # size of utf-8 string in bytes
       19
       >>> try: # utf-8 is a multibyte string. Convert it to unicode for use with builtin ord()
       ...     __builtin__.ord('б')  #  ascii string
       ... except Exception, e:
       ...     print 'Exception:', e
       Exception: ord() expected a character, but string of length 2 found
       >>> ord('б') # utf8.ord() is used(!!!)
       1073
       >>> ord(u'б') # utf8.ord() is used(!!!)
       1073
       >>> ord(s[3])  # utf8.ord() is used(!!!)
       1073
       >>> chr(ord(s[3])) # utf8.chr() and utf8.chr() is used(!!!)
       'б'
       >>> type(chr(1073))  # utf8.chr() is used(!!!)
       <class '__main__.Utf8'>
       >>> s=Utf8(test_unicode)
       >>> s
       'ПРоба Є PRobe'
       >>> s == test_str
       True
       >>> test_str == s
       True
       >>> s == test_unicode
       True
       >>> test_unicode == s
       True
       >>> print test_str.upper() # only ASCII characters uppered
       ПРоба Є PROBE
       >>> print test_unicode.upper() # unicode gives right result
       ПРОБА Є PROBE
       >>> s.upper() # utf8 class use unicode.upper()
       'ПРОБА Є PROBE'
       >>> type(s.upper())
       <class '__main__.Utf8'>
       >>> s.lower()
       'проба є probe'
       >>> type(s.lower())
       <class '__main__.Utf8'>
       >>> s.capitalize()
       'Проба є probe'
       >>> type(s.capitalize())
       <class '__main__.Utf8'>
       >>> len(s)
       13
       >>> len(test_unicode)
       13
       >>> s+'. Probe is проба'
       'ПРоба Є PRobe. Probe is проба'
       >>> type(s+'. Probe is проба')
       <class '__main__.Utf8'>
       >>> s+u'. Probe is проба'
       'ПРоба Є PRobe. Probe is проба'
       >>> type(s+u'. Probe is проба')
       <class '__main__.Utf8'>
       >>> s+s
       'ПРоба Є PRobeПРоба Є PRobe'
       >>> type(s+s)
       <class '__main__.Utf8'>
       >>> a=s
       >>> a+=s
       >>> a+=test_unicode
       >>> a+=test_str
       >>> a
       'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
       >>> type(a)
       <class '__main__.Utf8'>
       >>> s*3
       'ПРоба Є PRobeПРоба Є PRobeПРоба Є PRobe'
       >>> type(s*3)
       <class '__main__.Utf8'>
       >>> a=Utf8("-проба-")
       >>> a*=10
       >>> a
       '-проба--проба--проба--проба--проба--проба--проба--проба--проба--проба-'
       >>> type(a)
       <class '__main__.Utf8'>
       >>> print "'"+test_str.center(17)+"'" # WRONG RESULT!
       'ПРоба Є PRobe'
       >>> s.center(17) # RIGHT!
       '  ПРоба Є PRobe  '
       >>> type(s.center(17))
       <class '__main__.Utf8'>
       >>> (test_word+test_number_str).isalnum() # WRONG RESULT! non ASCII chars are detected as non alpha
       False
       >>> Utf8(test_word+test_number_str).isalnum()
       True
       >>> s.isalnum()
       False
       >>> test_word.isalpha() # WRONG RESULT! Non ASCII characters are detected as non alpha
       False
       >>> Utf8(test_word).isalpha() # RIGHT!
       True
       >>> s.lower().islower()
       True
       >>> s.upper().isupper()
       True
       >>> print test_str.zfill(17) # WRONG RESULT!
       ПРоба Є PRobe
       >>> s.zfill(17) # RIGHT!
       '0000ПРоба Є PRobe'
       >>> type(s.zfill(17))
       <class '__main__.Utf8'>
       >>> s.istitle()
       False
       >>> s.title().istitle()
       True
       >>> Utf8('1234').isdigit()
       True
       >>> Utf8(' \t').isspace()
       True
       >>> s.join('•|•')
       '•ПРоба Є PRobe|ПРоба Є PRobe•'
       >>> s.join((str('(utf8 тест1)'), unicode('(unicode тест2)','utf-8'), '(ascii test3)'))
       '(utf8 тест1)ПРоба Є PRobe(unicode тест2)ПРоба Є PRobe(ascii test3)'
       >>> type(s)
       <class '__main__.Utf8'>
       >>> s==test_str
       True
       >>> s==test_unicode
       True
       >>> s.swapcase()
       'прОБА є prOBE'
       >>> type(s.swapcase())
       <class '__main__.Utf8'>
       >>> truncate(s, 10)
       'ПРоба Є...'
       >>> truncate(s, 20)
       'ПРоба Є PRobe'
       >>> truncate(s, 10, '•••') # utf-8 string as *dots*
       'ПРоба Є•••'
       >>> truncate(s, 10, u'®') # you can use unicode string as *dots*
       'ПРоба Є P®'
       >>> type(truncate(s, 10))
       <class '__main__.Utf8'>
       >>> Utf8(s.encode('koi8-u'), 'koi8-u')
       'ПРоба Є PRobe'
       >>> s.decode() # convert utf-8 string to unicode
       u'\\u041f\\u0420\\u043e\\u0431\\u0430 \\u0404 PRobe'
       >>> a='про\\tba'
       >>> str_tmp=a.expandtabs()
       >>> utf8_tmp=Utf8(a).expandtabs()
       >>> utf8_tmp.replace(' ','.') # RIGHT! (default tabsize is 8)
       'про.....ba'
       >>> utf8_tmp.index('b')
       8
       >>> print "'"+str_tmp.replace(' ','.')+"'" # WRONG STRING LENGTH!
       'про..ba'
       >>> str_tmp.index('b') # WRONG index of 'b' character
       8
       >>> print "'"+a.expandtabs(4).replace(' ','.')+"'" # WRONG RESULT!
       'про..ba'
       >>> Utf8(a).expandtabs(4).replace(' ','.') # RIGHT!
       'про.ba'
       >>> s.find('Є')
       6
       >>> s.find(u'Є')
       6
       >>> s.find(' ', 6)
       7
       >>> s.rfind(' ')
       7
       >>> s.partition('Є')
       ('ПРоба ', 'Є', ' PRobe')
       >>> s.partition(u'Є')
       ('ПРоба ', 'Є', ' PRobe')
       >>> (a,b,c) = s.partition('Є')
       >>> type(a), type(b), type(c)
       (<class '__main__.Utf8'>, <class '__main__.Utf8'>, <class '__main__.Utf8'>)
       >>> s.partition(' ')
       ('ПРоба', ' ', 'Є PRobe')
       >>> s.rpartition(' ')
       ('ПРоба Є', ' ', 'PRobe')
       >>> s.index('Є')
       6
       >>> s.rindex(u'Є')
       6
       >>> s.index(' ')
       5
       >>> s.rindex(' ')
       7
       >>> a=Utf8('а б ц д е а б ц д е а\\tб ц д е')
       >>> a.split()
       ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
       >>> a.rsplit()
       ['а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е', 'а', 'б', 'ц', 'д', 'е']
       >>> a.expandtabs().split('б')
       ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
       >>> a.expandtabs().rsplit('б')
       ['а ', ' ц д е а ', ' ц д е а   ', ' ц д е']
       >>> a.expandtabs().split(u'б', 1)
       ['а ', ' ц д е а б ц д е а   б ц д е']
       >>> a.expandtabs().rsplit(u'б', 1)
       ['а б ц д е а б ц д е а   ', ' ц д е']
       >>> a=Utf8("рядок1\\nрядок2\\nрядок3")
       >>> a.splitlines()
       ['рядок1', 'рядок2', 'рядок3']
       >>> a.splitlines(True)
       ['рядок1\\n', 'рядок2\\n', 'рядок3']
       >>> s[6]
       'Є'
       >>> s[0]
       'П'
       >>> s[-1]
       'e'
       >>> s[:10]
       'ПРоба Є PR'
       >>> s[2:-2:2]
       'оаЄPo'
       >>> s[::-1]
       'eboRP Є абоРП'
       >>> s.startswith('ПР')
       True
       >>> s.startswith(('ПР', u'об'),0)
       True
       >>> s.startswith(u'об', 2, 4)
       True
       >>> s.endswith('be')
       True
       >>> s.endswith(('be', 'PR', u'Є'))
       True
       >>> s.endswith('PR', 8, 10)
       True
       >>> s.endswith('Є', -7, -6)
       True
       >>> s.count(' ')
       2
       >>> s.count(' ',6)
       1
       >>> s.count(u'Є')
       1
       >>> s.count('Є', 0, 5)
       0
       >>> Utf8("Parameters: '%(проба)s', %(probe)04d, %(проба2)s") % { u"проба": s,
       ...      "not used": "???", "probe":  2, "проба2": u"ПРоба Probe" }
       "Parameters: 'ПРоба Є PRobe', 0002, ПРоба Probe"
       >>> a=Utf8(u"Параметр: (%s)-(%s)-[%s]")
       >>> a%=(s, s[::-1], 1000)
       >>> a
       'Параметр: (ПРоба Є PRobe)-(eboRP Є абоРП)-[1000]'
       >>> if hasattr(Utf8,  'format'):
       ...     Utf8("Проба <{0}>, {1}, {param1}, {param2}").format(s, u"中文字",
       ...           param1="барабан", param2=1000) == 'Проба <ПРоба Є PRobe>, 中文字, барабан, 1000'
       ... else: # format() method is not used in python with version <2.6:
       ...     print True
       True
       >>> u'Б'<u'Ї' # WRONG ORDER!
       False
       >>> 'Б'<'Ї' # WRONG ORDER!
       False
       >>> Utf8('Б')<'Ї' # RIGHT!
       True
       >>> u'д'>u'ґ' # WRONG ORDER!
       False
       >>> Utf8('д')>Utf8('ґ') # RIGHT!
       True
       >>> u'є'<=u'ж' # WRONG ORDER!
       False
       >>> Utf8('є')<=u'ж' # RIGHT!
       True
       >>> Utf8('є')<=u'є'
       True
       >>> u'Ї'>=u'И' # WRONG ORDER!
       False
       >>> Utf8(u'Ї') >= u'И' # RIGHT
       True
       >>> Utf8('Є') >= 'Є'
       True
       >>> a="яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ"  # str type
       >>> b=u"яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ" # unicode type
       >>> c=Utf8("яжертиуіопшщїасдфгґхйклчєзьцвбнмюЯЖЕРТИУІОПШЩЇАСДФГҐХЙКЛЧЗЬЦВБНМЮЄ") # utf8 class
       >>> result = "".join(sorted(a))
       >>> result[0:20] # result is not utf8 string, because bytes, not utf8-characters were sorted
       '\\x80\\x81\\x82\\x83\\x84\\x84\\x85\\x86\\x86\\x87\\x87\\x88\\x89\\x8c\\x8e\\x8f\\x90\\x90\\x91\\x91'
       >>> try:
       ...   unicode(result, 'utf-8') # try to convert result (utf-8?) to unicode
       ... except Exception, e:
       ...    print 'Exception:', e
       Exception: 'utf8' codec can't decode byte 0x80 in position 0: unexpected code byte
       >>> try: # FAILED! (working with bytes, not with utf8-charactes)
       ...    "".join( sorted(a, key=sort_key) ) # utf8.sort_key may be used with utf8 or unicode strings only!
       ... except Exception, e:
       ...    print 'Exception:', e
       Exception: 'utf8' codec can't decode byte 0xd1 in position 0: unexpected end of data
       >>> print "".join( sorted(Utf8(a))) # converting *a* to unicode or utf8-string gives us correct result
       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
       >>> print u"".join( sorted(b) ) # WRONG ORDER! Default sort key is used
       ЄІЇАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЬЮЯабвгдежзийклмнопрстуфхцчшщьюяєіїҐґ
       >>> print u"".join( sorted(b, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
       >>> print "".join( sorted(c) ) # RIGHT ORDER! Utf8 "rich comparison" methods are used
       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
       >>> print "".join( sorted(c, key=sort_key) ) # RIGHT ORDER! utf8.sort_key is used
       аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ
       >>> Utf8().join(sorted(c.decode(), key=sort_key)) # convert to unicode for better performance
       'аАбБвВгГґҐдДеЕєЄжЖзЗиИіІїЇйЙкКлЛмМнНоОпПрРсСтТуУфФхХцЦчЧшШщЩьЬюЮяЯ'
       >>> for result in sorted(["Іа", "Астро", u"гала", Utf8("Гоша"), "Єва", "шовк", "аякс", "Їжа",
       ...                       "ґанок", Utf8("Дар'я"), "білінг", "веб", u"Жужа", "проба", u"тест",
       ...                       "абетка", "яблуко", "Юляся", "Київ", "лимонад", "ложка", "Матриця",
       ...                      ], key=sort_key):
       ...     print result.ljust(20), type(result)
       абетка         <type 'str'>
       Астро           <type 'str'>
       аякс             <type 'str'>
       білінг         <type 'str'>
       веб               <type 'str'>
       гала                 <type 'unicode'>
       ґанок           <type 'str'>
       Гоша                 <class '__main__.Utf8'>
       Дар'я                <class '__main__.Utf8'>
       Єва               <type 'str'>
       Жужа                 <type 'unicode'>
       Іа                 <type 'str'>
       Їжа               <type 'str'>
       Київ             <type 'str'>
       лимонад       <type 'str'>
       ложка           <type 'str'>
       Матриця       <type 'str'>
       проба           <type 'str'>
       тест                 <type 'unicode'>
       шовк             <type 'str'>
       Юляся           <type 'str'>
       яблуко         <type 'str'>
       >>> a=Utf8("中文字")
       >>> L=list(a)
       >>> L
       ['中', '文', '字']
       >>> a="".join(L)
       >>> print a
       中文字
       >>> type(a)
       <type 'str'>
       >>> a="中文字"  # standard str type
       >>> L=list(a)
       >>> L
       ['\\xe4', '\\xb8', '\\xad', '\\xe6', '\\x96', '\\x87', '\\xe5', '\\xad', '\\x97']
       >>> from string import maketrans
       >>> str_tab=maketrans('PRobe','12345')
       >>> unicode_tab={ord(u'П'):ord(u'Ж'),
       ...              ord(u'Р')      : u'Ш',
       ...              ord(Utf8('о')) : None,  # utf8.ord() is used
       ...              ord('б')       : None,  # -//-//-
       ...              ord(u'а')      : u"中文字",
       ...              ord(u'Є')      : Utf8('•').decode(), # only unicode type is supported
       ...             }
       >>> s.translate(unicode_tab).translate(str_tab, deletechars=' ')
       'ЖШ中文字•12345'
       """
       import sys
       reload(sys)
       sys.setdefaultencoding("UTF-8")
       import doctest
       print "DOCTESTS STARTED..."
       doctest.testmod()
       print "DOCTESTS FINISHED"

    doctests()



